#Loading Packages
#Will most likely add more
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(bulletxtrctr)
## Warning in rgl.init(initValue, onlyNULL): RGL: unable to open X11 display
## Warning: 'rgl_init' failed, running with rgl.useNULL = TRUE
library(x3ptools)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library(readr)
library(furrr)
## Loading required package: future
library(stringr)
library(dichromat)
library(future)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
library(tidyr)
options(future.globals.maxSize = 4*1024*1024*1024)
#data_dir <- "/media/Raven/REU_Refit"
#load("Data_NeededV2.RData") #Load for all results faster
# Reading in Hamby_173
df <- tibble(path = list.files(path = "Hamby_173",
pattern = ".x3p", recursive = T,
full.names = T)) %>%
mutate(Barrel = str_extract(path, "(Unknown|Barrel)\\d{0,2}") %>%
str_remove("Barrel"),
Bullet = str_extract(path, "Bullet[12A-Z]") %>%
str_remove("Bullet"),
Land = str_extract(path, "land\\d{1}") %>%
str_remove("land")) %>%
mutate(Set = "Hamby173") %>%
mutate(x3p = future_map(path, read_x3p)) %>%
mutate(x3p = future_map(x3p, ~x3p_m_to_mum(.) %>% y_flip_x3p()))
# Reading in Hamby_252
df2 <- tibble(path = list.files(path = "Hamby_252",
pattern = ".x3p", recursive = T,
full.names = T)) %>%
mutate(Barrel = str_extract(path, "(Unknown|Barrel)\\d{0,2}") %>%
str_remove("Barrel"),
Bullet = str_extract(path, "Bullet[12A-Z]") %>%
str_remove("Bullet"),
Land = str_extract(path, "Bullet [12A-Z]-[123456]") %>%
str_remove("Bullet [12A-Z]-")) %>%
mutate(Set = "Hamby252") %>%
mutate(x3p = future_map(path,read_x3p)) %>%
# Adjust orientation
mutate(x3p = future_map(x3p, ~x3p_m_to_mum(.) %>% rotate_x3p(angle = -90) %>% y_flip_x3p()))
# One big data set - easier to debug the code if you do everything in one big go.
hamby <- bind_rows(df, df2)
hamby <- hamby %>%
mutate(id = paste(Set, Barrel, Bullet, Land, sep = "-")) %>%
select(id, Set, Barrel, Bullet, Land, x3p, path)
rm(df, df2)
plan(multicore) # use all the cores at once
hamby <- hamby %>%
mutate(
CrossSection = future_map_dbl(x3p, x3p_crosscut_optimize, minccf = 0.9, span = 0.3, percent_missing = 25))
head(select(hamby, -path, -x3p), 5)
## # A tibble: 5 x 6
## id Set Barrel Bullet Land CrossSection
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 Hamby173-10-1-1 Hamby173 10 1 1 50
## 2 Hamby173-10-1-2 Hamby173 10 1 2 50
## 3 Hamby173-10-1-3 Hamby173 10 1 3 50
## 4 Hamby173-10-1-4 Hamby173 10 1 4 50
## 5 Hamby173-10-1-5 Hamby173 10 1 5 50
#hamby %>% arrange(desc(CrossSection))
plot1 = hamby %>%
filter(Barrel != "Unknown") %>%
ggplot(aes(x = Barrel, y = CrossSection, fill = Bullet))+
geom_boxplot()+
ggtitle("Barrels 1-10")
plot2 = hamby %>%
filter(Barrel == "Unknown") %>%
ggplot(aes(x = Barrel, y = CrossSection, fill = Bullet))+
geom_boxplot()+
ggtitle("Barrel Unknown")
grid.arrange(plot1, plot2)
Looking at the Boxplot for Barrels 1-10, we can see the inter-quartile ranges seem to be roughly the same for each Bullet. For Barrels 1-10, Bullet 2 seems to have higher values than Bullet 1. Barrels 6 and 7 have the largest whiskers. Barrel 1, Bullet 1 has the smallest inter-quartile range and Barrel 9, Bullet 2 has the largest interquartile range. There is one apparent outlier for Barrel 10, Bullet 1. This CrossSection Value is not high enough for concern when looking at all Barrels but should be examined separately.
Looking at the Boxplot for Barrel “Unknown” we can see an apparent outlier for Bullet B. This CrossSection value is 400. Unlike with Barrels 1-10 we can see the Bullets for Barrel “Unknown” have noticeably more lower whiskers.
#Cross Sections
hamby <- hamby %>%
mutate(CrossCut = future_map2(.x = x3p, .y = CrossSection, .f = x3p_crosscut))
crosscuts <- select(hamby, -path, -x3p) %>%
tidyr::unnest(CrossCut)
crosscuts %>%
arrange(desc(CrossSection))
## # A tibble: 636,728 x 9
## id Set Barrel Bullet Land CrossSection x y value
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Hamby252-Unk… Hamby2… Unkno… B 2 400 0 400 61.5
## 2 Hamby252-Unk… Hamby2… Unkno… B 2 400 1.56 400 61.5
## 3 Hamby252-Unk… Hamby2… Unkno… B 2 400 3.12 400 61.7
## 4 Hamby252-Unk… Hamby2… Unkno… B 2 400 4.69 400 61.9
## 5 Hamby252-Unk… Hamby2… Unkno… B 2 400 6.25 400 61.9
## 6 Hamby252-Unk… Hamby2… Unkno… B 2 400 7.81 400 62.2
## 7 Hamby252-Unk… Hamby2… Unkno… B 2 400 9.38 400 62.2
## 8 Hamby252-Unk… Hamby2… Unkno… B 2 400 10.9 400 62.3
## 9 Hamby252-Unk… Hamby2… Unkno… B 2 400 12.5 400 62.7
## 10 Hamby252-Unk… Hamby2… Unkno… B 2 400 14.1 400 63.0
## # … with 636,718 more rows
Note: For Hamby173 barrels 1, 2, and 4 for bullet 1 seem to have higher crosscut values plotted. For Hamby173 barrel 1, bullet 2 seems to have a higher crosscut values plotted. There doesn’t seem to be any noticeably low values for Hamby173.
Note: For Hamby 252 barrel 10 and 2, bullet 1 has high crosscut values plotted. There also seems to be some low crosscut values plotted as well.
#Looking at CrossCuts to see any issues
crosscuts %>%
filter(Barrel != "Unknown" & Set == "Hamby173") %>%
ggplot(data = ., aes(x = x, y = value, color = Land)) +
geom_line() +
facet_grid(Set + paste("Bullet", Bullet) ~ sprintf("Barrel %02s", Barrel))
crosscuts %>%
filter(Barrel != "Unknown" & Set == "Hamby252") %>%
ggplot(data = ., aes(x = x, y = value, color = Land)) +
geom_line() +
facet_grid(Set + paste("Bullet", Bullet) ~ sprintf("Barrel %02s", Barrel))
crosscuts %>%
filter(Set == "Hamby173" & Barrel == "Unknown") %>%
ggplot(aes(x = x, y = value)) +
geom_line() +
facet_grid(Bullet~Land, labeller="label_both") +
theme_bw()+ ggtitle("Barrel Unknown for Hamby173")
crosscuts %>%
filter(Set == "Hamby252" & Barrel == "Unknown") %>%
ggplot(aes(x = x, y = value)) +
geom_line() +
facet_grid(Bullet~Land, labeller="label_both") +
theme_bw()+ ggtitle("Barrel Unknown for Hamby252")
crosscuts %>% filter(id == "Hamby252-Unknown-B-2") %>%
ggplot(aes(x = x, y = value))+
geom_line()+
facet_grid(Bullet~Land + Set, labeller = "label_both")+
theme_bw() + ggtitle("High CrossSection Bullet B")
Looking at the CrossCuts we can see the side profile for a bullets land impression for every barrel in Hamby173 and Hamby252. For most of the CrossCuts the shoulders can be clearly identified but there are some that can not be “clearly” identified due to possbile tank rash, pitting, or scans that do not meet the standards that forensic analysis require. I can not say for certain if there are scans that should be excluded. I used FIX3P to examine the “groove to groove” scans and noticed considerable tank rash on some of the scans. There was some noticeable pitting as well but I only felt it was a concern when the pitting was concentrated in the center of the “groove to groove” scan.
When looking at the Crosscuts we can see most have noticeable curvature. There are some scans that do not and these scans can look like a flat line or even a traingle… I Have made note of these particular scans as well.
#Grooves
saved_grooves_location <- "V2H173_H252_Grooves_data.rda"
if (file.exists(saved_grooves_location)) {
hamby$Grooves <- readRDS(saved_grooves_location)
} else {
hamby <- hamby %>%
mutate(Grooves = CrossCut %>%
future_map(.f = cc_locate_grooves,
method = "rollapply", smoothfactor = 15, return_plot = T)) # use plot so that the shiny app works...
}
grooves <- hamby %>% tidyr::unnest(Grooves)
head(grooves, 10)
## # A tibble: 10 x 8
## id Set Barrel Bullet Land path CrossSection Grooves
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <list>
## 1 Hamby17… Hamby… 10 1 1 Hamby_173/Barr… 50 <dbl […
## 2 Hamby17… Hamby… 10 1 1 Hamby_173/Barr… 50 <S3: g…
## 3 Hamby17… Hamby… 10 1 2 Hamby_173/Barr… 50 <dbl […
## 4 Hamby17… Hamby… 10 1 2 Hamby_173/Barr… 50 <S3: g…
## 5 Hamby17… Hamby… 10 1 3 Hamby_173/Barr… 50 <dbl […
## 6 Hamby17… Hamby… 10 1 3 Hamby_173/Barr… 50 <S3: g…
## 7 Hamby17… Hamby… 10 1 4 Hamby_173/Barr… 50 <dbl […
## 8 Hamby17… Hamby… 10 1 4 Hamby_173/Barr… 50 <S3: g…
## 9 Hamby17… Hamby… 10 1 5 Hamby_173/Barr… 50 <dbl […
## 10 Hamby17… Hamby… 10 1 5 Hamby_173/Barr… 50 <S3: g…
head(select(hamby, -path, -x3p, -CrossCut), 5)
## # A tibble: 5 x 7
## id Set Barrel Bullet Land CrossSection Grooves
## <chr> <chr> <chr> <chr> <chr> <dbl> <list>
## 1 Hamby173-10-1-1 Hamby173 10 1 1 50 <list [2]>
## 2 Hamby173-10-1-2 Hamby173 10 1 2 50 <list [2]>
## 3 Hamby173-10-1-3 Hamby173 10 1 3 50 <list [2]>
## 4 Hamby173-10-1-4 Hamby173 10 1 4 50 <list [2]>
## 5 Hamby173-10-1-5 Hamby173 10 1 5 50 <list [2]>
plan(sequential) # stop furrr multicore processes - messes with shiny/rmd
Hamby_test <- hamby %>%
filter(Set == "Hamby252") %>%
filter(Barrel == 6)
Hamby_test2 <- hamby %>%
filter(Set == "Hamby252") %>%
filter(Barrel == 3 & Bullet == 1)
Hamby_test3 <- hamby %>%
filter(Set == "Hamby252") %>%
filter(Barrel == 1 & Bullet == 1)
Hamby_test4 <- hamby %>%
filter(Set == "Hamby252") %>%
filter(Barrel == 9 & Bullet == 2)
Hamby_test5 <- hamby %>%
filter(Set == "Hamby252") %>%
filter(Barrel == "Unknown") %>%
filter(Bullet == "B" | Bullet == "S" | Bullet == "U")
Hamby_test6 <- hamby %>%
filter(Set == "Hamby173") %>%
filter(Barrel == 3 & Bullet == 2)
Hamby_test7 <- hamby %>%
filter(Set == "Hamby173") %>%
filter(Barrel == "Unknown") %>%
filter(Bullet == "B" | Bullet == "E" | Bullet == "U")
gridExtra::grid.arrange(Hamby_test$Grooves[[1]]$plot,
Hamby_test$Grooves[[7]]$plot,
Hamby_test2$Grooves[[4]]$plot,
Hamby_test3$Grooves[[6]]$plot,
Hamby_test4$Grooves[[4]]$plot,
Hamby_test5$Grooves[[2]]$plot,
Hamby_test5$Grooves[[10]]$plot,
nrow = 2)
gridExtra::grid.arrange(Hamby_test6$Grooves[[1]]$plot,
Hamby_test7$Grooves[[3]]$plot,
Hamby_test7$Grooves[[15]]$plot,
Hamby_test7$Grooves[[12]]$plot,
nrow = 2)
rm(Hamby_test, Hamby_test2, Hamby_test3, Hamby_test4, Hamby_test5, Hamby_test6, Hamby_test7 )
library(shiny)
if (file.exists(saved_grooves_location)) {
hamby$Grooves <- readRDS(saved_grooves_location)
}
if (interactive()) { # only run when you're manually running chunks... don't run when the whole document is compiled.
shinyApp(
ui = fluidPage(
selectInput("k", "Investigate kth plot:",
selected = 1,
choices = (1:length(hamby$Grooves)) %>% set_names(hamby$id)
),
textOutput("groovelocations"),
actionButton("confirm", "Confirm"),
actionButton("save", "Save"),
plotOutput("groovePlot", click = "plot_click"),
verbatimTextOutput("info")
),
server = function(input, output, session) {
output$groovePlot <- renderPlot({
k <- as.numeric(input$k)
p <- hamby$Grooves[[k]]$plot
p
})
output$groovelocations <- renderText({
paste(
"Left Groove: ", hamby$Grooves[[as.numeric(input$k)]]$groove[1],
" Right Groove: ", hamby$Grooves[[as.numeric(input$k)]]$groove[2]
)
})
observeEvent(input$confirm, {
cat(paste(hamby$id[as.numeric(input$k)], "\n"))
updateSelectInput(session, "k", "Investigate kth plot:",
selected = as.numeric(input$k) + 1,
choices = (1:length(hamby$Grooves)) %>% set_names(hamby$id)
)
})
observeEvent(input$save, {
saveRDS(hamby$Grooves, file = saved_grooves_location)
message("groove data saved\n")
})
observeEvent(input$plot_click, {
k <- as.numeric(input$k)
xloc <- input$plot_click$x
gr <- hamby$Grooves[[k]]$groove
if (abs(gr[1] - xloc) < abs(gr[2] - xloc)) {
hamby$Grooves[[k]]$groove[1] <<- xloc
} else {
hamby$Grooves[[k]]$groove[2] <<- xloc
}
output$groovePlot <- renderPlot({
k <- as.numeric(input$k)
p <- hamby$Grooves[[k]]$plot +
geom_vline(xintercept = hamby$Grooves[[k]]$groove[1], colour = "green") +
geom_vline(xintercept = hamby$Grooves[[k]]$groove[2], colour = "green")
p
})
})
output$info <- renderText({
paste0("x=", input$plot_click$x, "\ny=", input$plot_click$y)
})
},
options = list(height = 500)
)
saveRDS(hamby$Grooves, file = saved_grooves_location)
} else {
if (!file.exists(saved_grooves_location)) {
message("run script in interactive mode to fix grooves")
} else {
hamby$Grooves <- readRDS(saved_grooves_location)
}
}
#Test Some Grooves(performed:06/12/19)
#Signatures
hamby <- hamby %>%
mutate(Signatures = future_map2(.x = CrossCut, .y = Grooves, .f = cc_get_signature, span = 0.75, span2 = .03))
Signatures <- hamby %>%
select(id, Set, Barrel, Bullet, Land, Signatures) %>%
tidyr::unnest()
#Looking for major issues - grooves not set correctly (large deviations at beginning or end of a line)
Signatures %>%
filter(Barrel != "Unknown") %>%
ggplot(data = ., aes(x = x, y = sig, color = Land)) +
geom_line()+
facet_grid(Set + paste("Bullet", Bullet) ~ sprintf("Barrel %02s", Barrel)) # put the barrels in the right order
## Warning: Removed 2733 rows containing missing values (geom_path).
Signatures %>%
filter(Set == "Hamby173") %>%
filter(Barrel == "Unknown") %>%
ggplot(data = ., aes(x = x, y = sig, color = Land)) +
geom_line()+
facet_wrap(paste("Bullet", Bullet) ~ sprintf("Barrel %02s", Barrel), ncol = 4)
## Warning: Removed 1682 rows containing missing values (geom_path).
Signatures %>%
filter(Set == "Hamby252") %>%
filter(Barrel == "Unknown") %>%
ggplot(data = ., aes(x = x, y = sig, color = Land)) +
geom_line()+
facet_wrap(paste("Bullet", Bullet) ~ sprintf("Barrel %02s", Barrel), ncol = 4)
## Warning: Removed 1550 rows containing missing values (geom_path).
load("Data_NeededV3.RData")
Examining the Signature Profiles we can see for barrels 1-10, for both Hamby173 + Hamby252, that the signatures for lands 1-6 are aligned pretty well. There seems to be some noticeable deviations at the beginning and end for Barrels 1-10 in both sets. I looked at the groove to groove scans using FIX3P and these spikes at the beginning and end for Barrels 1 and 10 can be identified. Groove locations seem to be set correctly for these lands.
There is some noticeable missing data for Signature comparisons [Hamby252/Bullet1] when filling by Bullet. This is hard to see when filling by the Lands.
We also examine the number of consecutively matching peaks in signatures for two aligned lands.(Case Validation Paper)
Groove Identifications Approved
comparisons <- crossing(Bullet1 = hamby$id, Bullet2 = hamby$id) %>%
left_join(nest(hamby, -id) %>% magrittr::set_names(c("Bullet1", "Bullet1_data"))) %>%
left_join(nest(hamby, -id) %>% magrittr::set_names(c("Bullet2", "Bullet2_data"))) %>%
mutate(Set1 = str_extract(Bullet1, "Hamby\\d{2,3}"),
Set2 = str_extract(Bullet2, "Hamby\\d{2,3}")) %>%
filter(Set1 == Set2) %>% # Get rid of cross-set comparisons for now...
select(-matches("Set"))
#plan(multicore(workers = availableCores(constraints = 8)))
get_sig <- function(data) {
purrr::map(data$Signatures, "sig")
}
comparisons <- comparisons %>%
mutate(sig1 = purrr::map(Bullet1_data, get_sig), sig2 = purrr::map(Bullet2_data, get_sig))
plan(multicore)
comparisons <- comparisons %>%
mutate(Aligned = future_map2(sig1, sig2, ~sig_align(unlist(.x), unlist(.y)))) # Getting Aligned signatures
# Get striae
comparisons <- comparisons %>%
mutate(Striae = future_map(Aligned, sig_cms_max)) # Obtaining Striae
saveRDS(select(comparisons, -Bullet1_data, -Bullet2_data), file = "Hamby_173_252_Comparisons.rda")
comparisons <- comparisons %>%
select(-Bullet1_data, -Bullet2_data)
plan(multicore(workers = availableCores(constraints = 8))) # Core Restraint so server does not die
comparisons <- comparisons %>%
mutate(features = future_map2(.x = Aligned, .y = Striae, .f = extract_features_all, resolution = 1.5625)) #ObtainingFeatures
comparisons <- comparisons %>%
mutate(Legacy_Features = future_map(Striae, extract_features_all_legacy, resolution = 1.5625)) # Obtaining feature leacy
comparisons <- comparisons %>%
tidyr::unnest(Legacy_Features) # Extracting feature legacy
comparisons <- comparisons %>%
mutate(Set = gsub("(Hamby173|Hamby252)-([0-9]{1,2}|Unknown)-([1-2A-Z])-([1-6])", "\\1", Bullet2)) # Creating columns from IDs
comparisons <- comparisons %>%
mutate(BarrelA = gsub("(Hamby173|Hamby252)-([0-9]{1,2}|Unknown)-([1-2A-Z])-([1-6])", "\\2", Bullet2))
comparisons <- comparisons %>%
mutate(BarrelB = gsub("(Hamby173|Hamby252)-([0-9]{0,2}|Unknown)-([1-2A-Z])-([1-6])", "\\2", Bullet1))
comparisons <- comparisons %>%
mutate(BulletA = gsub("(Hamby173|Hamby252)-([0-9]{1,2}|Unknown)-([1-2A-Z])-([1-6])", "\\3", Bullet2))
comparisons <- comparisons %>%
mutate(BulletB = gsub("(Hamby173|Hamby252)-([0-9]{1,2}|Unknown)-([1-2A-Z])-([1-6])", "\\3", Bullet1))
comparisons <- comparisons %>%
mutate(LandA = gsub("(Hamby173|Hamby252)-([0-9]{1,2}|Unknown)-([1-2A-Z])-([1-6])", "\\4", Bullet2))
comparisons <- comparisons %>%
mutate(LandB = gsub("(Hamby173|Hamby252)-([0-9]{1,2}|Unknown)-([1-2A-Z])-([1-6])", "\\4", Bullet1))
head(comparisons, 500)
## # A tibble: 500 x 21
## Bullet1 Bullet2 ccf rough_cor lag D sd_D signature_length
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Hamby1… Hamby1… 1 1 0 0 0.00489 1.64
## 2 Hamby1… Hamby1… 0.350 0.350 0.116 0.00259 0.00479 2.02
## 3 Hamby1… Hamby1… 0.519 0.519 -0.164 0.00186 0.00348 1.88
## 4 Hamby1… Hamby1… 0.220 0.220 0.146 0.00295 0.00478 2.09
## 5 Hamby1… Hamby1… 0.292 0.292 0.035 0.00250 0.00442 1.80
## 6 Hamby1… Hamby1… 0.248 0.248 0 0.00284 0.00458 1.78
## 7 Hamby1… Hamby1… 0.248 0.248 -0.088 0.00222 0.00377 1.76
## 8 Hamby1… Hamby1… 0.287 0.287 0.115 0.00233 0.00425 1.94
## 9 Hamby1… Hamby1… 0.258 0.258 -0.113 0.00438 0.00683 1.66
## 10 Hamby1… Hamby1… 0.271 0.271 0.159 0.00246 0.00370 1.89
## # … with 490 more rows, and 13 more variables: overlap <dbl>,
## # matches <dbl>, mismatches <dbl>, cms <dbl>, non_cms <dbl>,
## # sum_peaks <dbl>, Set <chr>, BarrelA <chr>, BarrelB <chr>,
## # BulletA <chr>, BulletB <chr>, LandA <chr>, LandB <chr>
features_2017 <- read_csv("features-hamby.csv") #Reading in csv
## Parsed with column specification:
## cols(
## .default = col_double(),
## match = col_logical(),
## study1 = col_character(),
## study2 = col_character(),
## barrel2 = col_character()
## )
## See spec(...) for full column specifications.
## Warning: 31240 parsing failures.
## row col expected actual file
## 90507 barrel1 a double A 'features-hamby.csv'
## 90508 barrel1 a double A 'features-hamby.csv'
## 90509 barrel1 a double A 'features-hamby.csv'
## 90510 barrel1 a double A 'features-hamby.csv'
## 90511 barrel1 a double A 'features-hamby.csv'
## ..... ....... ........ ...... ....................
## See problems(...) for more details.
features_2017 <- features_2017 %>%
select(-land1_id, -land2_id) %>% # removing
filter(study1 != "Cary" & study2 != "Cary") %>% # removing
rename(BarrelB = barrel1, BulletB = bullet1, LandB = land1) %>% # Changed column names
rename(BarrelA = barrel2, BulletA = bullet2, LandA = land2) %>% # Changed column names
select(-study1) %>%
rename(Set = study2) %>%
mutate(Set = gsub("Hamby44", "Hamby173", Set)) %>% #Chnaging observation name
mutate(Set = factor(Set, levels = c("Hamby173", "Hamby252"))) %>% # for ordering principles
rename(ccf2 = ccf, rough_cor2 = rough_cor, lag2 = lag, D2 = D, sd_D2 = sd_D, signature_length2 = signature_length, overlap2 = overlap, matches2 = matches, mismatches2 = mismatches, cms2x = cms, non_cms2 = non_cms, sum_peaks2 = sum_peaks) # Column names changed for comparing purposes
#Exploring we see that all lettered Barrels only have bullet equal to 1. No need to worry about a lettered barrel having a bullet 2
# Code will not look like this forever... Duct Tape... Will Dplyr soon...
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "A"] <- "A") #Assign A to Bullets with column "BarrelA" equalequal to "A"
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "B"] <- "B")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "C"] <- "C")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "D"] <- "D")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "E"] <- "E")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "F"] <- "F")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "G"] <- "G")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "H"] <- "H")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "I"] <- "I")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "J"] <- "J")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "L"] <- "L")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "M"] <- "M")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "N"] <- "N")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "Q"] <- "Q")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "R"] <- "R")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "S"] <- "S")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "U"] <- "U")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "V"] <- "V")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "W"] <- "W")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "X"] <- "X")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "Y"] <- "Y")
features_2017 <- within(features_2017, BulletA[BulletA == 1 & BarrelA == "Z"] <- "Z")
features_2017$BarrelA[features_2017$BarrelA == "A"] <- "Unknown" #Group lettered Barrels together into Barrel "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "B"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "C"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "D"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "E"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "F"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "G"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "H"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "I"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "J"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "L"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "M"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "N"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "Q"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "R"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "S"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "U"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "V"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "W"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "X"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "Y"] <- "Unknown"
features_2017$BarrelA[features_2017$BarrelA == "Z"] <- "Unknown"
features_2017 <- features_2017 %>%
mutate(Bullet1 = paste(Set, BarrelB, BulletB, LandB, sep = "-"),
Bullet2 = paste(Set, BarrelA, BulletA, LandA, sep = "-")) # Creating ID similar to Heike's
features_2017 <- features_2017[order(features_2017$Set), ] #Ordered Set column so that all 173 observations came before 252 observations
# At first glance Hamby173 Barrel 10, Bullet 1, Land 1 seems to be missing
features_2017 <- features_2017 %>%
mutate(BarrelB = as.character(BarrelB),
BulletA = as.character(BulletA),
BulletB = as.character(BulletB),
LandA = as.character(LandA),
LandB = as.character(LandB))
table(comparisons$BarrelA)
##
## 1 10 2 3 4 5 6 7 8
## 5040 5040 5040 5040 5040 5040 5040 5040 5040
## 9 Unknown
## 5040 37800
table(comparisons$BulletA)
##
## 1 2 A B C D E F G H I J
## 25200 25200 1260 2520 2520 1260 2520 1260 1260 2520 1260 1260
## L M N Q R S U V W X Y Z
## 2520 2520 1260 1260 1260 1260 2520 1260 1260 2520 1260 1260
table(comparisons$LandA)
##
## 1 2 3 4 5 6
## 14700 14700 14700 14700 14700 14700
#----------------------------------------------------------------#
table(features_2017$BarrelA)
##
## 1 10 2 3 4 5 6 7 8
## 852 276 1428 1909 2438 3108 3519 4236 4812
## 9 Unknown
## 5152 56936
table(features_2017$BulletA)
##
## 1 2 A B C D E F G H I J
## 13570 14160 1431 3102 3498 2031 3606 2103 1575 3750 1647 2175
## L M N Q R S U V W X Y Z
## 3894 3677 1749 1900 1785 2313 3776 1857 1893 4308 2415 2451
table(features_2017$LandA)
##
## 1 2 3 4 5 6
## 13927 13781 14401 13411 14538 14608
library(viridis)
## Loading required package: viridisLite
comparisons %>% #Comparison heatmap for 2019 data
filter(!is.na(BarrelA)) %>%
filter(!is.na(BarrelB)) %>%
group_by(BarrelA, BarrelB, Set) %>%
summarise(count = n()) %>%
ggplot(aes(x = BarrelA, y = BarrelB))+
geom_tile(aes(fill = count))+
scale_fill_viridis(option = "plasma", direction = -1)+
geom_text(aes(label = count))+
ggtitle("Comparison Tiles")+
theme_bw()+
facet_grid(~Set) #Good way to compare graphs below. Shows how bad "features-hamby.csv" really is.
comparisons %>%
filter(!is.na(BarrelA)) %>%
filter(!is.na(BarrelB)) %>%
group_by(BarrelA, BarrelB, Set) %>% summarise(count = max(signature_length)) %>%
ggplot(aes(x = BarrelA, y = BarrelB))+
geom_tile(aes(fill = count))+
scale_fill_viridis(option = "plasma", direction = -1)+
geom_text(aes(label = count))+
ggtitle("Comparison Signature_Length Tiles")+
theme_bw()+
facet_grid(~Set)
features_2017 %>% # features_2017 plot 1
filter(!is.na(BarrelA)) %>%
filter(!is.na(BarrelB)) %>%
group_by(BarrelA, BarrelB, Set) %>%
summarise(count = n()) %>%
ggplot(aes(x = BarrelA, y = BarrelB))+
geom_tile(aes(fill = count))+
scale_fill_viridis(option = "plasma", direction = -1)+
geom_text(aes(label = count))+
ggtitle("Comparison Count Version 1")+
theme_bw()+ theme(axis.text.x = element_text(angle = 10))+
facet_grid(~Set) #Very Promising Results. This geom_tile/heat_map shows my analysis in visual format.
features_2017 %>% # features_2017 plot 2
filter(!is.na(BarrelA)) %>%
filter(!is.na(BarrelB)) %>%
group_by(BarrelA, BarrelB, Bullet1, Bullet2, Set) %>%
arrange(desc(signature_length2)) %>%
filter(row_number() == 1) %>%
ungroup() %>%
group_by(BarrelA, BarrelB, Set) %>%
summarise(count = n()) %>%
ggplot(aes(x = BarrelA, y = BarrelB))+
geom_tile(aes(fill = count))+
scale_fill_viridis(option = "plasma", direction = -1)+
geom_text(aes(label = count))+
ggtitle("Comparison Count Version 2")+
theme_bw()+ theme(axis.text.x = element_text(angle = 10))+
facet_grid(~Set)
We shall use the comparison plot from our 2019 data to help compare our data from 2017. Looking at features_2017 plot1 we can see that my analysis about duplicate data seems to be correct. Looking at features_2017 plot2 we can see that there is also missing data on the Barrel-Bullet-Land level. Both of these visual representations are also written in the word.document
# Let's try to find those missing values with an easy way
# Inspiration came from Susan and
#https://www.r-bloggers.com/r-sorting-a-data-frame-by-the-contents-of-a-column/
#https://www.rdocumentation.org/packages/base/versions/3.6.0/topics/t
comparisons_check <- comparisons %>%
select(Bullet1, Bullet2) #Remove Columns we wont be using
features_2017_check <- features_2017 %>%
select(Bullet1, Bullet2) # Remove Columns we wont be using
finder <- t(apply(comparisons_check,1,sort))
finder <- data.frame(finder)
#Take Transpose and Sort in order... Now we have:
#AB => AB
#BA => AB
finder <- finder[!duplicated(finder),] # Remove duplicates
finder <- finder %>%
rename(Bullet2 = X2, Bullet1 = X1) %>%
select(Bullet1, Bullet2)
finder <- anti_join(finder, features_2017_check, by = c("Bullet2")) # Shows what data is missing and if you add Bullet1 to the anti_join you can see all thee missing Unknown|Unknown Comparisons
## Warning: Column `Bullet2` joining factor and character vector, coercing
## into character vector
#Better version of above method I think, using dplyr
comparison_split <- comparisons %>% #Assign Comparisons
rowwise() %>% #Every Row
mutate(sorter = paste(sort(c(Bullet1, Bullet2)), collapse="-")) %>% # create col "sorter", which pastes both cols and sorts in order
distinct(sorter, .keep_all=T) %>% # remove all rows with duplicate values in sorter
select(-sorter) # no need for column
feature_id1 <- features_2017[!duplicated(features_2017[c("Bullet1","Bullet2")]),] #profile1
feature_id2 <- anti_join(features_2017, feature_id1) #profile2
## Joining, by = c("compare_id", "profile1_id", "profile2_id", "ccf2", "rough_cor2", "lag2", "D2", "sd_D2", "signature_length2", "overlap2", "matches2", "mismatches2", "cms2x", "non_cms2", "sum_peaks2", "match", "BarrelB", "BulletB", "LandB", "Set", "BarrelA", "BulletA", "LandA", "Bullet1", "Bullet2")
feature_id1 <- na.omit(feature_id1) # Remove Na
feature_id2 <- na.omit(feature_id2) # Remove Na
scatter1 <- inner_join(comparison_split, feature_id1, by = c("Bullet1", "Bullet2")) #ScatterPlot Comparisons for profile_id version 1
scatter2 <- inner_join(comparison_split, feature_id2, by = c("Bullet1", "Bullet2")) # ScatterPlot Comparisons for Profile_id version 2
scatter1 %>% ggplot(aes(x = ccf, y = ccf2))+
geom_bin2d(bins = 65)+
geom_smooth(method = lm)+
geom_smooth(aes(colour = "red"), show.legend = FALSE)+
theme_bw()+
ggtitle("ccf Vs ccf2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
scatter2 %>% ggplot(aes(x = ccf, y = ccf2))+
geom_bin2d(bins = 65)+
geom_smooth(method = lm)+
geom_smooth(aes(colour = "red"), show.legend = FALSE)+
theme_bw()+
ggtitle("ScatterPlot2 ccf Vs ccf2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
scatter1 %>% ggplot(aes(x = rough_cor, y = rough_cor2))+
geom_bin2d(bins = 65)+
geom_smooth(method = lm)+
geom_smooth(aes(colour = "red"), show.legend = FALSE)+
theme_bw()+
ggtitle("rough_cor Vs rough_cor2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
scatter2 %>% ggplot(aes(x = rough_cor, y = rough_cor2))+
geom_bin2d(bins = 65)+
geom_smooth(method = lm)+
geom_smooth(aes(colour = "red"), show.legend = FALSE)+
theme_bw()+
ggtitle("ScatterPlot2 rough_cor Vs rough_cor2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
scatter1 %>% ggplot(aes(x = cms, y = cms2x))+
geom_bin2d(bins = 65)+
geom_smooth(method = lm)+
geom_smooth(aes(colour = "red"), show.legend = FALSE)+
theme_bw()+
ggtitle("cms Vs cms2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
scatter2 %>% ggplot(aes(x = cms, y = cms2x))+
geom_bin2d(bins = 65)+
geom_smooth(method = lm)+
geom_smooth(aes(colour = "red"), show.legend = FALSE)+
theme_bw()+
ggtitle("ScatterPlot2 cms Vs cms2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
scatter1 %>% ggplot(aes(x = sum_peaks, y = sum_peaks2))+
geom_bin2d(bins = 65)+
geom_smooth(method = lm)+
geom_smooth(aes(colour = "red"), show.legend = FALSE)+
theme_bw()+
ggtitle("sum_peaks Vs sum_peaks2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
scatter2 %>% ggplot(aes(x = sum_peaks, y = sum_peaks2))+
geom_bin2d(bins = 65)+
geom_smooth(method = lm)+
geom_smooth(aes(colour = "red"), show.legend = FALSE)+
theme_bw()+
ggtitle("ScatterPlot2 sum_peaks Vs sum_peaks2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
scatter1 %>% ggplot(aes(x = signature_length, y = signature_length2))+
geom_bin2d(bins = 65)+
geom_smooth(method = lm)+
geom_smooth(aes(colour = "red"), show.legend = FALSE)+
theme_bw()+
ggtitle("signature_length Vs signature_length2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
scatter2 %>% ggplot(aes(x = signature_length, y = signature_length2))+
geom_bin2d(bins = 65)+
geom_smooth(method = lm)+
geom_smooth(aes(colour = "red"), show.legend = FALSE)+
theme_bw()+
ggtitle("ScatterPlot2 signature_length Vs signature_length2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
scatter1 %>% ggplot(aes(x = D, y = D2))+
geom_bin2d(bins = 65)+
geom_smooth(method = lm)+
geom_smooth(aes(colour = "red"), show.legend = FALSE)+
theme_bw()+
ggtitle("Distance Vs Distance2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
scatter2 %>% ggplot(aes(x = D, y = D2))+
geom_bin2d(bins = 65)+
geom_smooth(method = lm)+
geom_smooth(aes(colour = "red"), show.legend = FALSE)+
theme_bw()+
ggtitle("ScatterPlot2 Distance Vs Distance2")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
scatterA <- gather(scatter1, key="measure", value="value", c("cms", "non_cms", "matches", "mismatches", "D", "sum_peaks", "signature_length", "ccf2", "D2", "signature_length2", "matches2", "mismatches2", "cms2x", "non_cms2", "sum_peaks2"))
scatterB <- gather(scatter1, key="measure", value="value", c("cms", "non_cms", "matches", "mismatches", "D", "sum_peaks", "signature_length", "ccf", "D2", "signature_length2", "matches2", "mismatches2", "cms2x", "non_cms2", "sum_peaks2"))
scatterA %>%
ggplot(aes(x = ccf, y = value))+
geom_point()+
geom_smooth(method = lm)+
geom_smooth(aes(colour = "red"), show.legend = FALSE)+
facet_wrap(~measure, ncol = 4, scale = "free")+ scale_x_log10()+
theme_bw()+
ggtitle("ccf VS other features")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
scatterB %>%
ggplot(aes(x = ccf2, y = value))+
geom_point()+
geom_smooth(method = lm)+
geom_smooth(aes(colour = "red"), show.legend = FALSE)+
facet_wrap(~measure, ncol = 4, scale = "free")+
theme_bw()+
ggtitle("ccf2 VS other features")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#scatterA %>%
#ggplot(aes(x = cms, y = value))+
#geom_point()+
#geom_smooth(method = lm)+
#geom_smooth(aes(colour = "red"), show.legend = FALSE)+
#facet_wrap(~measure, ncol = 4, scale = "free")+
#theme_bw()+
#ggtitle("cms VS other features")
#scatterB %>%
#ggplot(aes(x = cms2x, y = value))+
#geom_point()+
#geom_smooth(method = lm)+
#geom_smooth(aes(colour = "red"), show.legend = FALSE)+
#facet_wrap(~measure, ncol = 4, scale = "free")+
#theme_bw()+
#ggtitle("cms2x VS other features")
#---------------------------------------------------------------------------------------#
#CHECK
Looking at the normal/facet_grid scatterplots we can make some observations but before that, let me go over some details. Two geom_smooth() layers were added. The blue line represents a lm method and the red line represents the gam method. Looking at the graphs where each feature is compared with its other version, we see there is no linear relationship between x and y. For the graphs where ccf, ccf2, cms, and cms2 are compared to the other features, we also see no linear relationships between x and y. The scatterplots look like huge clouds.